import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
# Confirm that the imports above succeeded and record the library versions
# this notebook was run with.
print("✅ All libraries loaded.")
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)
print("Seaborn:", sns.__version__)
print("SciPy:", scipy.__version__)
✅ All libraries loaded. NumPy: 1.26.4 Pandas: 2.3.0+4.g1dfc98e16a Seaborn: 0.11.2 SciPy: 1.13.1
# Load the water-quality dataset from the working directory and preview
# its first 50 rows.
df = pd.read_csv('water_potability.csv')
df.head(50)
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
| 5 | 5.584087 | 188.313324 | 28748.687739 | 7.544869 | 326.678363 | 280.467916 | 8.399735 | 54.917862 | 2.559708 | 0 |
| 6 | 10.223862 | 248.071735 | 28749.716544 | 7.513408 | 393.663396 | 283.651634 | 13.789695 | 84.603556 | 2.672989 | 0 |
| 7 | 8.635849 | 203.361523 | 13672.091764 | 4.563009 | 303.309771 | 474.607645 | 12.363817 | 62.798309 | 4.401425 | 0 |
| 8 | NaN | 118.988579 | 14285.583854 | 7.804174 | 268.646941 | 389.375566 | 12.706049 | 53.928846 | 3.595017 | 0 |
| 9 | 11.180284 | 227.231469 | 25484.508491 | 9.077200 | 404.041635 | 563.885481 | 17.927806 | 71.976601 | 4.370562 | 0 |
| 10 | 7.360640 | 165.520797 | 32452.614409 | 7.550701 | 326.624353 | 425.383419 | 15.586810 | 78.740016 | 3.662292 | 0 |
| 11 | 7.974522 | 218.693300 | 18767.656682 | 8.110385 | NaN | 364.098230 | 14.525746 | 76.485911 | 4.011718 | 0 |
| 12 | 7.119824 | 156.704993 | 18730.813653 | 3.606036 | 282.344050 | 347.715027 | 15.929536 | 79.500778 | 3.445756 | 0 |
| 13 | NaN | 150.174923 | 27331.361962 | 6.838223 | 299.415781 | 379.761835 | 19.370807 | 76.509996 | 4.413974 | 0 |
| 14 | 7.496232 | 205.344982 | 28388.004887 | 5.072558 | NaN | 444.645352 | 13.228311 | 70.300213 | 4.777382 | 0 |
| 15 | 6.347272 | 186.732881 | 41065.234765 | 9.629596 | 364.487687 | 516.743282 | 11.539781 | 75.071617 | 4.376348 | 0 |
| 16 | 7.051786 | 211.049406 | 30980.600787 | 10.094796 | NaN | 315.141267 | 20.397022 | 56.651604 | 4.268429 | 0 |
| 17 | 9.181560 | 273.813807 | 24041.326280 | 6.904990 | 398.350517 | 477.974642 | 13.387341 | 71.457362 | 4.503661 | 0 |
| 18 | 8.975464 | 279.357167 | 19460.398131 | 6.204321 | NaN | 431.443990 | 12.888759 | 63.821237 | 2.436086 | 0 |
| 19 | 7.371050 | 214.496610 | 25630.320037 | 4.432669 | 335.754439 | 469.914551 | 12.509164 | 62.797277 | 2.560299 | 0 |
| 20 | NaN | 227.435048 | 22305.567414 | 10.333918 | NaN | 554.820086 | 16.331693 | 45.382815 | 4.133423 | 0 |
| 21 | 6.660212 | 168.283747 | 30944.363591 | 5.858769 | 310.930858 | 523.671298 | 17.884235 | 77.042318 | 3.749701 | 0 |
| 22 | NaN | 215.977859 | 17107.224226 | 5.607060 | 326.943978 | 436.256194 | 14.189062 | 59.855476 | 5.459251 | 0 |
| 23 | 3.902476 | 196.903247 | 21167.500099 | 6.996312 | NaN | 444.478883 | 16.609033 | 90.181676 | 4.528523 | 0 |
| 24 | 5.400302 | 140.739062 | 17266.593422 | 10.056852 | 328.358241 | 472.874073 | 11.256381 | 56.931906 | 4.824786 | 0 |
| 25 | 6.514415 | 198.767351 | 21218.702871 | 8.670937 | 323.596349 | 413.290450 | 14.900000 | 79.847843 | 5.200885 | 0 |
| 26 | 3.445062 | 207.926260 | 33424.768678 | 8.782147 | 384.007006 | 441.785876 | 13.805902 | 30.284597 | 4.184397 | 0 |
| 27 | NaN | 145.768181 | 13224.935639 | 7.906445 | 304.001993 | 298.990666 | 12.729525 | 49.536849 | 4.004871 | 0 |
| 28 | NaN | 266.421018 | 26362.965012 | 7.700063 | 395.389490 | 364.480107 | 10.348951 | 53.008381 | 3.991564 | 0 |
| 29 | NaN | 148.153061 | 15193.413474 | 9.046833 | 307.011793 | 563.804743 | 16.568656 | 52.676185 | 6.038185 | 0 |
| 30 | 7.181449 | 209.625601 | 15196.229987 | 5.994679 | 338.336431 | 342.111286 | 7.922598 | 71.537953 | 5.088860 | 0 |
| 31 | 9.825490 | 190.756618 | 19677.892466 | 6.757541 | NaN | 452.836235 | 16.899038 | 47.081971 | 2.857472 | 0 |
| 32 | 10.433291 | 117.791230 | 22326.892046 | 8.161505 | 307.707509 | 412.986834 | 12.890709 | 65.733478 | 5.057311 | 0 |
| 33 | 7.414148 | 235.044534 | 32555.852537 | 6.845952 | 387.175316 | 411.983364 | 10.244815 | 44.489297 | 3.160624 | 0 |
| 34 | NaN | 232.280452 | 14787.206265 | 5.474915 | NaN | 383.981723 | 12.166937 | 86.080727 | 5.029167 | 0 |
| 35 | 5.115817 | 191.952743 | 19620.545329 | 6.060713 | 323.836384 | 441.748379 | 10.966486 | 49.238231 | 3.902089 | 0 |
| 36 | 3.641630 | 183.908722 | 24752.072460 | 5.538314 | 286.059556 | 456.860096 | 9.034067 | 73.594657 | 3.464353 | 0 |
| 37 | 5.618064 | 304.235912 | 17281.975168 | 6.101084 | NaN | 399.471566 | 12.265002 | 81.588992 | 2.896547 | 0 |
| 38 | NaN | 143.453731 | 19942.273218 | 5.890755 | NaN | 427.130671 | 22.469892 | 53.124094 | 2.907564 | 0 |
| 39 | 9.267188 | 198.614395 | 24683.723566 | 6.110612 | 328.077533 | 396.876949 | 16.471969 | 30.383315 | 4.324005 | 0 |
| 40 | NaN | 233.858996 | 11703.923907 | 4.599388 | 309.039320 | 349.399633 | 18.338893 | 42.677465 | 3.510004 | 0 |
| 41 | 5.331940 | 194.874065 | 16658.876503 | 7.993830 | 316.675162 | 335.120398 | 10.180514 | 59.572714 | 4.434820 | 0 |
| 42 | 7.145772 | 238.689929 | 28780.340432 | 6.814029 | 385.975650 | 332.032706 | 11.093163 | 66.138045 | 5.182591 | 0 |
| 43 | 9.920691 | 202.817483 | 9973.934059 | 6.882248 | 337.350529 | 333.192470 | 23.917601 | 71.833624 | 4.690707 | 0 |
| 44 | 4.758439 | 183.349454 | 21568.428779 | 4.731349 | NaN | 403.944168 | 18.668229 | 66.912400 | 4.542801 | 0 |
| 45 | 5.702926 | 216.850474 | 35606.440177 | 7.184351 | NaN | 504.638260 | 16.140790 | 77.536184 | 4.137739 | 0 |
| 46 | 6.953864 | 209.638293 | 10575.186281 | 4.462707 | 315.606594 | 391.184315 | 13.285334 | 87.390889 | 3.195710 | 0 |
| 47 | 10.682966 | 173.375498 | 15758.740621 | 5.570784 | 307.352586 | 323.807913 | 10.090870 | 78.472784 | 3.999775 | 0 |
| 48 | NaN | 129.890572 | 34415.853146 | 6.321929 | 304.535224 | 470.329169 | 18.599410 | 72.403634 | 4.405586 | 0 |
| 49 | 8.757257 | 200.191400 | 21536.224687 | 4.915101 | 317.882900 | 404.717799 | 13.768323 | 47.930872 | 3.626135 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
df[df['Potability'] == 1]
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 250 | 9.445130 | 145.805402 | 13168.529156 | 9.444471 | 310.583374 | 592.659021 | 8.606397 | 77.577460 | 3.875165 | 1 |
| 251 | 9.024845 | 128.096691 | 19859.676476 | 8.016423 | 300.150377 | 451.143481 | 14.770863 | 73.778026 | 3.985251 | 1 |
| 252 | NaN | 169.974849 | 23403.637304 | 8.519730 | NaN | 475.573562 | 12.924107 | 50.861913 | 2.747313 | 1 |
| 253 | 6.800119 | 242.008082 | 39143.403329 | 9.501695 | 187.170714 | 376.456593 | 11.432466 | 73.777275 | 3.854940 | 1 |
| 254 | 7.174135 | 203.408935 | 20401.102461 | 7.681806 | 287.085679 | 315.549900 | 14.533510 | 74.405616 | 3.939896 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.668102 | 193.681735 | 47580.991603 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 | 1 |
| 3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | NaN | 392.449580 | 19.903225 | NaN | 2.798243 | 1 |
| 3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | NaN | 432.044783 | 11.039070 | 69.845400 | 3.298875 | 1 |
| 3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | NaN | 402.883113 | 11.168946 | 77.488213 | 4.708658 | 1 |
| 3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | NaN | 327.459760 | 16.140368 | 78.698446 | 2.309149 | 1 |
1278 rows × 10 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
# check for duplicated rows
df.duplicated().sum()
0
df.describe()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2785.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 2495.000000 | 3276.000000 | 3276.000000 | 3114.000000 | 3276.000000 | 3276.000000 |
| mean | 7.080795 | 196.369496 | 22014.092526 | 7.122277 | 333.775777 | 426.205111 | 14.284970 | 66.396293 | 3.966786 | 0.390110 |
| std | 1.594320 | 32.879761 | 8768.570828 | 1.583085 | 41.416840 | 80.824064 | 3.308162 | 16.175008 | 0.780382 | 0.487849 |
| min | 0.000000 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
| 25% | 6.093092 | 176.850538 | 15666.690297 | 6.127421 | 307.699498 | 365.734414 | 12.065801 | 55.844536 | 3.439711 | 0.000000 |
| 50% | 7.036752 | 196.967627 | 20927.833607 | 7.130299 | 333.073546 | 421.884968 | 14.218338 | 66.622485 | 3.955028 | 0.000000 |
| 75% | 8.062066 | 216.667456 | 27332.762127 | 8.114887 | 359.950170 | 481.792304 | 16.557652 | 77.337473 | 4.500320 | 1.000000 |
| max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
df.isna().sum()
ph 491 Hardness 0 Solids 0 Chloramines 0 Sulfate 781 Conductivity 0 Organic_carbon 0 Trihalomethanes 162 Turbidity 0 Potability 0 dtype: int64
df.dtypes
ph float64 Hardness float64 Solids float64 Chloramines float64 Sulfate float64 Conductivity float64 Organic_carbon float64 Trihalomethanes float64 Turbidity float64 Potability int64 dtype: object
# Fill the missing measurements: median for ph and Sulfate, mean for
# Trihalomethanes. The result is assigned back to the column instead of
# calling fillna(inplace=True) on df[col]; that chained form acts on an
# intermediate object, raises a FutureWarning, and stops working in pandas 3.0.
df['ph'] = df['ph'].fillna(df['ph'].median())
df['Sulfate'] = df['Sulfate'].fillna(df['Sulfate'].median())
df['Trihalomethanes'] = df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean())
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1952783639.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['ph'].fillna(df['ph'].median(), inplace=True)
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1952783639.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['Sulfate'].fillna(df['Sulfate'].median(), inplace=True)
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1952783639.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean(), inplace=True)
df.isna().sum()
ph 0 Hardness 0 Solids 0 Chloramines 0 Sulfate 0 Conductivity 0 Organic_carbon 0 Trihalomethanes 0 Turbidity 0 Potability 0 dtype: int64
df[df['Potability'] == 1]
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 250 | 9.445130 | 145.805402 | 13168.529156 | 9.444471 | 310.583374 | 592.659021 | 8.606397 | 77.577460 | 3.875165 | 1 |
| 251 | 9.024845 | 128.096691 | 19859.676476 | 8.016423 | 300.150377 | 451.143481 | 14.770863 | 73.778026 | 3.985251 | 1 |
| 252 | 7.036752 | 169.974849 | 23403.637304 | 8.519730 | 333.073546 | 475.573562 | 12.924107 | 50.861913 | 2.747313 | 1 |
| 253 | 6.800119 | 242.008082 | 39143.403329 | 9.501695 | 187.170714 | 376.456593 | 11.432466 | 73.777275 | 3.854940 | 1 |
| 254 | 7.174135 | 203.408935 | 20401.102461 | 7.681806 | 287.085679 | 315.549900 | 14.533510 | 74.405616 | 3.939896 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.668102 | 193.681735 | 47580.991603 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 | 1 |
| 3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | 333.073546 | 392.449580 | 19.903225 | 66.396293 | 2.798243 | 1 |
| 3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | 333.073546 | 432.044783 | 11.039070 | 69.845400 | 3.298875 | 1 |
| 3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | 333.073546 | 402.883113 | 11.168946 | 77.488213 | 4.708658 | 1 |
| 3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | 333.073546 | 327.459760 | 16.140368 | 78.698446 | 2.309149 | 1 |
1278 rows × 10 columns
# Example: pH should not be 0 in real water
df[df['ph'] == 0]
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3014 | 0.0 | 214.846144 | 49456.587108 | 7.897539 | 333.073546 | 583.448849 | 7.702328 | 77.712891 | 4.92884 | 0 |
# Treat a pH of exactly 0 as a missing reading, then fill it with the median
# of the remaining pH values. The fill is assigned back to the column rather
# than done with a chained fillna(inplace=True), which pandas 3.0 will no
# longer apply to the original frame.
df.loc[df['ph'] == 0, 'ph'] = np.nan
df['ph'] = df['ph'].fillna(df['ph'].median())
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1201209032.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['ph'].fillna(df['ph'].median(), inplace=True)
df[df['ph'] == 0]
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability |
|---|
df.describe()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 |
| mean | 7.076341 | 196.369496 | 22014.092526 | 7.122277 | 333.608364 | 426.205111 | 14.284970 | 66.396293 | 3.966786 | 0.390110 |
| std | 1.464832 | 32.879761 | 8768.570828 | 1.583085 | 36.143851 | 80.824064 | 3.308162 | 15.769881 | 0.780382 | 0.487849 |
| min | 0.227499 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
| 25% | 6.279317 | 176.850538 | 15666.690297 | 6.127421 | 317.094638 | 365.734414 | 12.065801 | 56.647656 | 3.439711 | 0.000000 |
| 50% | 7.036752 | 196.967627 | 20927.833607 | 7.130299 | 333.073546 | 421.884968 | 14.218338 | 66.396293 | 3.955028 | 0.000000 |
| 75% | 7.870050 | 216.667456 | 27332.762127 | 8.114887 | 350.385756 | 481.792304 | 16.557652 | 76.666609 | 4.500320 | 1.000000 |
| max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
# Report how many samples carry the label Potability == 1 and what share of
# the whole dataset they represent.
total_samples = len(df)
is_safe = df['Potability'] == 1
safe_samples = int(is_safe.sum())
safe_percentage = (safe_samples / total_samples) * 100
print(f"Safe water samples: {safe_samples} out of {total_samples} ({safe_percentage:.2f}%)")
Safe water samples: 1278 out of 3276 (39.01%)
# Step 1: Remove old Potability column.
# The drop mutates df in place, so running this statement a second time
# without reloading the data raises KeyError (the column is already gone).
df.drop(columns=['Potability'], inplace=True)
# Step 2: Scoring Function (based on WHO recommended ranges)

# (column, exclusive upper limit, points awarded when the value is below it)
_POTABILITY_RULES = (
    ('Solids', 500, 10),
    ('Chloramines', 4, 10),
    ('Sulfate', 250, 10),
    ('Trihalomethanes', 80, 10),
    ('Organic_carbon', 5, 10),
    ('Conductivity', 1500, 10),
    ('Turbidity', 5, 10),
    ('Hardness', 120, 5),
)


def calculate_potability_score(row):
    """Return an additive quality score for one water sample.

    Args:
        row: Mapping-like object (DataFrame row, Series or dict) indexed by
            the column names 'ph', 'Solids', 'Chloramines', 'Sulfate',
            'Trihalomethanes', 'Organic_carbon', 'Conductivity', 'Turbidity'
            and 'Hardness'.

    Returns:
        int: 15 points if ph lies in [6.5, 8.5], plus the points of every
        rule in ``_POTABILITY_RULES`` whose value is strictly below its
        limit. NaN values fail every comparison and earn nothing.

    NOTE(review): the weights add up to 90, so 90 is the highest score this
    function can return, while map_potability_level only assigns
    'Very Good' from 91 upwards. Confirm whether the weights or the bands
    should change.
    """
    score = 15 if 6.5 <= row['ph'] <= 8.5 else 0
    for column, limit, points in _POTABILITY_RULES:
        if row[column] < limit:
            score += points
    return score
# Step 3: Apply score to each row
# axis=1 hands calculate_potability_score one row at a time; the returned
# values become the new Potability_Score column.
df['Potability_Score'] = df.apply(calculate_potability_score, axis=1)
# Step 4: Map score into categories
def map_potability_level(score):
    """Translate a numeric potability score into a quality label.

    Bands: below 41 'Very Poor', 41-60 'Poor', 61-75 'Average',
    76-90 'Good', 91 and above 'Very Good'.

    The bands are tested from the top down with ``>=`` so that a NaN score,
    which fails every comparison, ends up in 'Very Poor' rather than being
    reported as 'Very Good'.

    Args:
        score: Numeric score (int or float).

    Returns:
        str: One of 'Very Poor', 'Poor', 'Average', 'Good', 'Very Good'.
    """
    if score >= 91:
        return 'Very Good'
    if score >= 76:
        return 'Good'
    if score >= 61:
        return 'Average'
    if score >= 41:
        return 'Poor'
    return 'Very Poor'
# Derive the level label for every score, then (Step 5) store both columns
# with explicit dtypes: integer scores and categorical labels.
score_column = df['Potability_Score']
level_labels = score_column.map(map_potability_level)
df['Potability_Score'] = score_column.astype(int)
df['Potability_Level'] = level_labels.astype('category')
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the number of samples per potability level, with the bars
# ordered from the most to the least frequent level.
level_order = df['Potability_Level'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    x='Potability_Level',
    data=df,
    palette='Spectral',
    order=level_order,
    ax=ax,
)
ax.set_title("Distribution of Water Quality Levels")
ax.set_xlabel("Potability Level")
ax.set_ylabel("Number of Samples")
ax.grid(True, axis='y')
fig.tight_layout()
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:253: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_vals = vals.groupby(grouper)
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
# Step 1: Only keep very poor water samples
very_poor_df = df[df["Potability_Level"] == "Very Poor"]

# Step 2: Select only chemical columns
chemical_cols = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
                 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# Step 3: Normalize all columns to a 0–10 scale using MinMaxScaler.
# The scaler is fitted on the Very Poor subset only, so 0 and 10 correspond
# to that subset's own minimum and maximum of each column.
scaler = MinMaxScaler(feature_range=(0, 10))
normalized_data = scaler.fit_transform(very_poor_df[chemical_cols])

# Step 4: Take mean of each chemical (for Very Poor category), largest first
mean_normalized = (
    pd.DataFrame(normalized_data, columns=chemical_cols)
    .mean()
    .sort_values(ascending=False)
)

# Step 5: Create Plotly bar chart (horizontal, one bar per chemical).
# The x-axis label uses a proper en dash; the previous text was mis-encoded.
fig = px.bar(
    x=mean_normalized.values,
    y=mean_normalized.index,
    orientation='h',
    labels={'x': 'Normalized Mean Value (0–10)', 'y': 'Chemical'},
    title='Normalized Chemical Levels in Very Poor Water Samples',
    text=mean_normalized.round(2)
)
fig.update_traces(marker_color='indianred', textposition='outside')
fig.update_layout(xaxis_range=[0, 10])
fig.show()
# ๐ฆ 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# 2. Load and Clean Dataset
# Start again from the raw CSV and replace every missing numeric value with
# the median of its column.
df = pd.read_csv("water_potability.csv")
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)
# 3. Define Custom Scoring Function (Safe Bonus + Danger Penalty)

# (column, inclusive safe upper limit, penalty step factor) for every
# parameter that earns a bonus at or below its limit and a growing penalty
# above it.
_UPPER_LIMIT_RULES = (
    ('Solids', 5000, 1),
    ('Chloramines', 4, 0.5),
    ('Sulfate', 250, 0.5),
    ('Conductivity', 800, 0.5),
    ('Organic_carbon', 10, 0.5),
    ('Trihalomethanes', 80, 0.5),
    ('Turbidity', 5, 0.2),
)


def _penalty(value, limit, factor=1):
    """Return the points deducted for a value that exceeds its limit.

    The deduction is 10 points plus another 10 for every complete step of
    ``limit * factor`` by which ``value`` exceeds ``limit``.
    """
    return max(0, (value - limit) // (limit * factor) + 1) * 10


def custom_score(row):
    """Score one water sample from 0 to 100 using bonuses and penalties.

    Args:
        row: Mapping-like object (DataFrame row, Series or dict) indexed by
            'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
            'Conductivity', 'Organic_carbon', 'Trihalomethanes' and
            'Turbidity'.

    Returns:
        int: The total, clamped so it is never negative. It is always
        returned as an int, although the penalty arithmetic produces floats
        for non-integer readings.
    """
    score = 0

    # ✅ pH counts twice when it is inside [6.5, 8.5] (+10 safe-range bonus
    # and +10 extra); anything else, including NaN, costs 10 points.
    if 6.5 <= row['ph'] <= 8.5:
        score += 20
    else:
        score -= 10

    # Hardness has a flat bonus/penalty of 10 around the 300 limit.
    hardness = row['Hardness']
    if hardness <= 300:
        score += 10
    elif hardness > 300:
        score -= 10

    # ✅ Bonus: safe range = +10 each.
    # ❌ Penalty: more dangerous = more negative score.
    # A NaN value satisfies neither comparison and leaves the score unchanged.
    for column, limit, factor in _UPPER_LIMIT_RULES:
        value = row[column]
        if value <= limit:
            score += 10
        elif value > limit:
            score -= _penalty(value, limit, factor)

    return int(max(score, 0))
# 4. Apply Scoring
# axis=1 passes each row to custom_score; the results form the
# Potability_Score column.
df['Potability_Score'] = df.apply(custom_score, axis=1)
# 5. Categorize Potability Level

# Lowest score that still qualifies for each label, checked from best to worst.
_LEVEL_FLOORS = (
    (80, "Very Good"),
    (60, "Good"),
    (40, "Average"),
    (20, "Poor"),
)


def categorize(score):
    """Return the quality label for a score.

    Args:
        score: Numeric score (int or float).

    Returns:
        str: "Very Good" from 80, "Good" from 60, "Average" from 40,
        "Poor" from 20, otherwise "Very Poor" (this is also the result for
        NaN, which fails every comparison).
    """
    for floor, label in _LEVEL_FLOORS:
        if score >= floor:
            return label
    return "Very Poor"
# Label every row with the level that matches its score.
df['Potability_Level'] = df['Potability_Score'].apply(categorize)
# 6. Split Dataset for Any Further Use (e.g., model validation if needed)
# 80/20 split with a fixed seed; train_df and test_df are not referenced
# again in the code that follows.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# 7. Function to Predict from User Input
def predict_potability(user_input):
    """Score a single set of measurements and name its quality level.

    Args:
        user_input: Mapping from column name to measured value; it is
            wrapped in a pandas Series before being scored.

    Returns:
        tuple: ``(score, level)`` as produced by ``custom_score`` and
        ``categorize``.
    """
    score = custom_score(pd.Series(user_input))
    return score, categorize(score)
# 8. Example Prediction
# NOTE: the trailing comments quote reference ranges for each measurement;
# custom_score itself uses wider cut-offs for some of them
# (Hardness <= 300, Solids <= 5000, Organic_carbon <= 10).
user_input = {
'ph': 7.2, # 6.5 – 8.5
'Hardness': 100, # 60 – 120 mg/L (as CaCO₃)
'Solids': 400, # < 500 mg/L
'Chloramines': 2.5, # < 4.0 mg/L
'Sulfate': 210, # < 250 mg/L
'Conductivity': 700, # < 800 µS/cm
'Organic_carbon': 4, # < 5 mg/L
'Trihalomethanes': 70, # < 80 µg/L
'Turbidity': 3.5 # < 5 NTU (preferably < 1 NTU)
}
# Score the sample and report both the numeric score and its label.
score, level = predict_potability(user_input)
print(f"Predicted Score: {score} / 100")
print(f"Water Quality: {level}")
Predicted Score: 100 / 100 Water Quality: Very Good